Tests de independencia

Cargar librerías

library(dplyr)
library(ggplot2)

Load the data

# Read the Cleveland_hd CSV into a data frame; the path is relative to the
# current working directory.
hd_data <- read.csv("data/Cleveland_hd.csv")

Data processing

# Build the analysis table used in the rest of the document:
#   * sex         -> factor labelled "mujer" (0) / "hombre" (any other value)
#   * hd          -> 1 when `class` is above 0, otherwise 0
#   * hd_etiqueta -> factor label for `hd`
# The two new columns are moved right after `sex`, and only the columns needed
# by the tests below are kept.
hd_pr <- hd_data %>%
  mutate(
    sex = factor(ifelse(sex == 0, "mujer", "hombre")),
    hd = ifelse(class > 0, 1, 0),
    hd_etiqueta = factor(ifelse(hd == 0, "No enfermedad", "Enfermedad"))
  ) %>%
  relocate(hd, hd_etiqueta, .after = sex) %>%
  select(age:hd_etiqueta, trestbps, chol, thalach)

Chi cuadrado

Frecuencias observadas frente a las frecuencias esperadas.
Caso de una variable. Ejemplo: moneda legal frente a moneda trucada.

# Fair coin: simulate n independent tosses where both faces are equally likely
# (sample() draws uniformly when no `prob` is given) and tabulate the counts.
# `<-` and `TRUE` are used instead of `=` and `T`: `T` is an ordinary variable
# that can be reassigned, so it is not a safe stand-in for TRUE.
n <- 100
moneda <- sample(c("cara", "cruz"), size = n, replace = TRUE)
table(moneda)
## moneda
## cara cruz 
##   52   48
# Goodness-of-fit test on the observed counts. No `p` argument is supplied, so
# chisq.test() compares them against equal expected frequencies per category.
chisq.test(table(moneda))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(moneda)
## X-squared = 0.16, df = 1, p-value = 0.6892
# Dataviz for fair coin:
# turn the frequency table into a data frame with short column names.
datos <- table(moneda) %>%
  as.data.frame() %>%
  rename(res = moneda, freq = Freq)

# Bar chart of the observed counts. The dashed red line sits at n / 2, the
# count expected for each face under the null hypothesis p = 0.5.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so the chunk behaves the same on older versions.
ggplot(datos, aes(x = res, y = freq, fill = res)) +
  geom_col(width = 0.6, colour = "black") +
  geom_hline(
    yintercept = n / 2,
    colour = "red",
    linetype = "dashed",
    size = 1
  ) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  labs(
    title = "Resultados de 100 lanzamientos",
    subtitle = expression("Hipótesis nula: " ~ H[0] * ": p = 0.5"),
    x = "",
    y = "Frecuencia observada"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

# Rigged coin: simulate n tosses coded 0/1 where 0 comes up with probability
# 0.6 and 1 with probability 0.4, then tabulate the counts.
# `<-` and `TRUE` replace `=` and the reassignable shorthand `T`.
n <- 100000
t_moneda <- sample(c(0, 1), n, replace = TRUE, prob = c(0.6, 0.4))
table(t_moneda)
## t_moneda
##     0     1 
## 59884 40116
# Goodness-of-fit test against equal expected frequencies (no `p` supplied),
# i.e. the null hypothesis is that the coin is fair.
chisq.test(table(t_moneda))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(t_moneda)
## X-squared = 3907.7, df = 1, p-value < 2.2e-16
# Dataviz for rigged coin:
# turn the frequency table into a data frame and relabel the 0/1 outcomes
# as "cara"/"cruz" for plotting.
datos <- table(t_moneda) %>%
  as.data.frame() %>%
  rename(res = t_moneda, freq = Freq) %>%
  mutate(res = ifelse(res == 0, "cara", "cruz"))

# Bar chart of the observed counts. The dashed red line sits at n / 2, the
# count expected for each face under the null hypothesis p = 0.5.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so the chunk behaves the same on older versions.
datos %>%
  ggplot(aes(x = res, y = freq, fill = res)) +
  geom_col(width = 0.6, colour = "black") +
  geom_hline(
    yintercept = n / 2,
    colour = "red",
    linetype = "dashed",
    size = 1
  ) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  labs(
    title = "Resultados de 100000 lanzamientos",
    subtitle = expression("Hipótesis nula: " ~ H[0] * ": p = 0.5"),
    x = "",
    y = "Frecuencia observada"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

Caso de dos variables. Ejemplo: ¿hay asociación entre el sexo (variable cualitativa) y la condición sano/enfermo (variable cualitativa)?

Planteamiento de hipótesis :
H0: There is no association between sex and hd (= entre hd y sex).
H1: There is association between sex and hd (= entre hd y sex).

# Extract the two categorical variables as plain vectors: the disease label
# and the sex of each row.
vals_hd <- pull(hd_pr, hd_etiqueta)
vals_sex <- pull(hd_pr, sex)

# Chi-squared test of independence between sex and disease status.
# The vector names are kept as-is because chisq.test() echoes them in the
# "data:" line of its printed result.
hd_by_sex <- chisq.test(vals_sex, vals_hd)
hd_by_sex
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  vals_sex and vals_hd
## X-squared = 22.043, df = 1, p-value = 2.667e-06
# Stacked bars showing, within each disease group, the share of each sex.
# position = "fill" rescales every bar to the 0-1 range, so the tick labels
# are converted to percentages to agree with the axis title.
hd_pr %>%
  ggplot(aes(x = hd_etiqueta, fill = sex)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = function(p) paste0(100 * p, "%")) +
  labs(x = "", y = "porcentaje %")

t test

El t test se realiza cuando tenemos una variable dependiente numérica (continua) y una variable independiente categórica con dos grupos.

Examina la variable ‘chol’ entre los grupos enfermedad/sanos.

# Grouping factor (disease label) and response (cholesterol column) as vectors.
# `condition` is reused by the later exercises.
condition <- pull(hd_pr, hd_etiqueta)
colesterol <- pull(hd_pr, chol)

# Two-sample t-test of cholesterol between the two disease groups; the printed
# result below reports it as a Welch test.
chol_by_condition <- t.test(colesterol ~ condition)
chol_by_condition
## 
##  Welch Two Sample t-test
## 
## data:  colesterol by condition
## t = 1.4924, df = 298.64, p-value = 0.1366
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.815018 20.484170
## sample estimates:
##    mean in group Enfermedad mean in group No enfermedad 
##                    251.4748                    242.6402
# Boxplot of cholesterol per disease group with the raw points jittered on
# top. Boxplot outliers are hidden (outlier.shape = NA) because the jitter
# layer already draws every observation.
# Fix: the y axis was labelled "edad" although the plotted variable is `chol`.
hd_pr %>%
  ggplot(aes(x = hd_etiqueta, y = chol, color = hd_etiqueta)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(width = 0.3, alpha = 0.4) +
  labs(x = "", y = "colesterol") +
  theme(legend.position = "none")

Ejercicios :

1. Examina la variable ‘age’ entre los grupos enfermedad/sanos

# Exercise 1: compare age between the two disease groups with a two-sample
# t-test. `condition` is the disease-label vector created earlier in the file.
age <- pull(hd_pr, age)
age_by_condition <- t.test(age ~ condition)
print(age_by_condition)
## 
##  Welch Two Sample t-test
## 
## data:  age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.067682 6.013385
## sample estimates:
##    mean in group Enfermedad mean in group No enfermedad 
##                    56.62590                    52.58537
# Boxplot of age per disease group with the raw observations jittered on top;
# boxplot outliers are hidden because the jitter layer already shows them.
ggplot(hd_pr, aes(x = hd_etiqueta, y = age, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  labs(y = "edad", x = "") +
  theme(legend.position = "none")

2. Examina la variable ‘thalach’ entre los grupos enfermedad/sanos

# Exercise 2: compare `thalach` between the two disease groups.
# Fix: the test previously ran `age ~ condition`, so it simply repeated the
# age result instead of testing thalach.
thalach <- hd_pr %>% pull(thalach)

# `tension` was the original name of this vector; it is kept as an alias so
# any later code that refers to it keeps working.
tension <- thalach

thalach_by_condition <- t.test(thalach ~ condition)

print(thalach_by_condition)
# NOTE: the rendered output that follows this chunk in the document was
# produced by the earlier `age ~ condition` call; re-knit to refresh it.
## 
##  Welch Two Sample t-test
## 
## data:  age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.067682 6.013385
## sample estimates:
##    mean in group Enfermedad mean in group No enfermedad 
##                    56.62590                    52.58537
# Boxplot of thalach per disease group with the raw observations jittered on
# top; boxplot outliers are hidden because the jitter layer already shows them.
# Fix: the y aesthetic was `age` although the exercise and the axis label
# ("maximum heart rate") refer to `thalach`.
hd_pr %>%
  ggplot(aes(x = hd_etiqueta, y = thalach, col = hd_etiqueta)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(width = 0.3, alpha = 0.4) +
  labs(x = "", y = "maximum heart rate") +
  theme(legend.position = "none")

3. Examina la variable ‘trestbps’ entre los grupos enfermedad/sanos

# Exercise 3: compare `trestbps` between the two disease groups.
# Fix: the test previously ran `age ~ condition`, leaving `bps` unused and
# repeating the age result.
bps <- hd_pr %>% pull(trestbps)

bps_by_condition <- t.test(bps ~ condition)

print(bps_by_condition)
# NOTE: the rendered output that follows this chunk in the document was
# produced by the earlier `age ~ condition` call; re-knit to refresh it.
## 
##  Welch Two Sample t-test
## 
## data:  age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.067682 6.013385
## sample estimates:
##    mean in group Enfermedad mean in group No enfermedad 
##                    56.62590                    52.58537
# Boxplot of trestbps per disease group with the raw observations jittered on
# top; boxplot outliers are hidden because the jitter layer already shows them.
ggplot(hd_pr, aes(x = hd_etiqueta, y = trestbps, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  labs(y = "blood pressure (mm Hg)", x = "") +
  theme(legend.position = "none")

¿ Las medias de los grupos son o no son similares ?

# Same trestbps-by-disease-group plot, split into one panel per sex so the
# group comparison can be read separately for each sex.
ggplot(hd_pr, aes(x = hd_etiqueta, y = trestbps, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  labs(y = "blood pressure (mm Hg)", x = "") +
  theme(legend.position = "none") +
  facet_wrap(vars(sex))

4. Examina la variable ‘chol’ entre los grupos hombre/mujer

## 
##  Welch Two Sample t-test
## 
## data:  chol by sex
## t = -3.0643, df = 136.37, p-value = 0.002631
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -36.445477  -7.855795
## sample estimates:
## mean in group hombre  mean in group mujer 
##             239.6019             261.7526

5. Examina la variable ‘trestbps’ entre los grupos hombre/mujer

## 
##  Welch Two Sample t-test
## 
## data:  hd_pr$trestbps by hd_pr$sex
## t = -1.0622, df = 165.36, p-value = 0.2897
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.939856  2.084686
## sample estimates:
## mean in group hombre  mean in group mujer 
##             130.9126             133.3402

6. Examina la variable ‘trestbps’ entre los grupos hombre/mujer ENFERMOS

## 
##  Welch Two Sample t-test
## 
## data:  enfermedad$trestbps by enfermedad$sex
## t = -3.2448, df = 31.365, p-value = 0.002792
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.886672  -5.453679
## sample estimates:
## mean in group hombre  mean in group mujer 
##             131.9298             146.6000

7. Examina la variable ‘thalach’ entre los grupos hombre/mujer

## 
##  Welch Two Sample t-test
## 
## data:  hd_pr$thalach by hd_pr$sex
## t = -0.90442, df = 223.85, p-value = 0.3667
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.572564  2.808276
## sample estimates:
## mean in group hombre  mean in group mujer 
##             148.8447             151.2268

8. Examina la variable ‘thalach’ entre los grupos hombre/mujer ENFERMOS

## 
##  Welch Two Sample t-test
## 
## data:  enfermedad$thalach by enfermedad$sex
## t = -1.039, df = 39.072, p-value = 0.3052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -14.015402   4.502419
## sample estimates:
## mean in group hombre  mean in group mujer 
##             138.4035             143.1600